{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 熟悉数据\n",
    "\n",
    "---\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "from tqdm import tqdm\n",
    "import traceback\n",
    "import sys\n",
    "\n",
    "sys.path.insert(0, \"/home/team55/notespace/zengbin\")\n",
    "\n",
    "from jddc.config import PreConfig\n",
    "from jddc.utils import write_file, read_file, save_to_pkl, read_from_pkl, create_logger\n",
    "\n",
    "# Instantiate the PreConfig settings object, then create the logger named 'pre'\n",
    "# from its log_file and cmd_log attributes.\n",
    "conf = PreConfig()\n",
    "logger = create_logger(\n",
    "    name='pre',\n",
    "    log_file=conf.log_file,\n",
    "    cmd=conf.cmd_log\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 测试数据\n",
    "\n",
    "---\n",
    "\n",
    "单轮测试50条，每条对应10个参考答案。\n",
    "\n",
    "多轮会话5个，每个都有若干个问题，每个问题对应有10个参考答案。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the test questions and the test answers from the files named in the config.\n",
    "test_q = read_file(conf.file_test_q)\n",
    "test_a = read_file(conf.file_test_a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the test questions into the 50 single-turn tests and the 5 multi-turn\n",
    "# session tests: entries containing the \"<s>\" marker go to the single-turn list,\n",
    "# all remaining entries go to the multi-turn list.\n",
    "marker = \"<s>\"\n",
    "test_q50 = [q for q in test_q if marker in q]\n",
    "test_mq5 = [q for q in test_q if marker not in q]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one query string per single-turn entry by joining segments 0, 2 and 4 of\n",
    "# its \"<s>\"-separated fields with single spaces.\n",
    "# NOTE(review): presumably the even-indexed segments are the customer turns of a\n",
    "# Q/A/Q/A/Q context -- confirm against the data file.\n",
    "# Each entry is split once rather than three times; an entry with fewer than five\n",
    "# segments still raises IndexError.\n",
    "q50_qs = [\n",
    "    \" \".join((segments[0], segments[2], segments[4]))\n",
    "    for segments in (x.split(\"<s>\") for x in test_q50)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Length (in characters) of every joined single-turn query string.\n",
    "q50_qs_l = list(map(len, q50_qs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[89,\n",
       " 45,\n",
       " 35,\n",
       " 129,\n",
       " 79,\n",
       " 76,\n",
       " 69,\n",
       " 78,\n",
       " 81,\n",
       " 59,\n",
       " 98,\n",
       " 76,\n",
       " 23,\n",
       " 51,\n",
       " 105,\n",
       " 51,\n",
       " 36,\n",
       " 46,\n",
       " 32,\n",
       " 125,\n",
       " 104,\n",
       " 60,\n",
       " 60,\n",
       " 38,\n",
       " 28,\n",
       " 66,\n",
       " 47,\n",
       " 60,\n",
       " 35,\n",
       " 31,\n",
       " 58,\n",
       " 61,\n",
       " 74,\n",
       " 50,\n",
       " 146,\n",
       " 45,\n",
       " 15,\n",
       " 121,\n",
       " 70,\n",
       " 120,\n",
       " 75,\n",
       " 101,\n",
       " 81,\n",
       " 44,\n",
       " 25,\n",
       " 54,\n",
       " 77,\n",
       " 113,\n",
       " 34,\n",
       " 45]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "q50_qs_l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "test_a[-10:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_q50[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace each \"<s>\" marker and each full-width comma in the first single-turn\n",
    "# entry with a space.\n",
    "# NOTE(review): the appended \"<s\" lacks the closing \">\", so the pattern does not\n",
    "# match it and it stays at the end of the result -- confirm whether \"<s>\" was meant.\n",
    "marker_or_comma = '(<s>)|(，)'\n",
    "re.sub(marker_or_comma, ' ', test_q50[0] + \"<s\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
